# from google.colab import drive
# drive.mount('/content/drive')
# ! pip install swifter
# ! pip install matplotlib==3.4.0
# ! pip install textacy
# ! pip install thinc
# ! pip install gensim
# ! pip install pyLDAvis
# !python -m spacy download en_core_web_lg
# # trzeba uruchomić ponownie środowisko wykonawcze po pobraniu
# Core data handling, NLP, topic-modelling and visualisation libraries.
import pandas as pd
import numpy as np
import spacy
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamulticore import LdaMulticore
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
# Large English pipeline with word vectors (downloaded in the commented
# cell at the top of the notebook).
en = spacy.load("en_core_web_lg")
import os
import pickle
from collections import Counter
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="whitegrid")
import plotly.express as px
import plotly.offline as py
py.init_notebook_mode()
# Data directory: Colab (Drive) path above, local path below.
#DIR = '/content/drive/MyDrive/NLP-klimat/'
DIR = '../NLP-klimat/'
Teksty zostały odczytane z PDF-ów na podstawie wcześniejszego otagowania poszczególnych dokumentów.
# Read the per-document NECP annotations and turn the literal string
# "None" back into a real missing value.
NECP_annotations = pd.read_csv(DIR + 'NECP.txt').replace({"None": None})
NECP - National Energy and Climate Plan (Krajowy plan na rzecz energii i klimatu)
Aby zrealizować ustanowione przez Unię Europejską cele w zakresie energii i klimatu na 2030 rok, państwa członkowskie zostały zobowiązane do ustanowienia 10-letniego planu na rzecz energii i klimatu na okres od 2021 do 2030 roku (NECP).
Struktura NECP

Zatem dla każdego z 27 państw członkowskich otrzymujemy sekcje:
Według wzorcowej struktury sekcje 2-5 powinny być podzielone na 5 wymiarów:
W rzeczywistości w większości planów w sekcji oceny wpływu planowanych działań na rzecz klimatu nie ma podziału na 5 wymiarów
# Load the pre-processed NECP fragments (first CSV column is the index).
necp_processed = pd.read_csv(DIR + 'necp_processed.csv', index_col=0)
# Columns of the imported data frame.
necp_processed.columns
# Page/offset bookkeeping columns are not needed for topic modelling.
necp_processed.drop(
    ['start_page', 'end_page', 'start_text', 'end_text'],
    axis=1,
    inplace=True,
)
# Discard fragments for which no text was extracted.
null_text_idx = necp_processed[necp_processed.isnull()["text"]].index
necp_processed.drop(null_text_idx, axis=0, inplace=True)
len(necp_processed)
Zostały 453 części dokumentów.
import swifter
import warnings
warnings.filterwarnings("default")
# One-off spaCy processing of every fragment (slow) was exported to a
# pickle; subsequent runs just re-load the pickled Doc objects below.
# tqdm.pandas()
# necp_docs = necp_processed['text'].swifter.apply(en)
# # export of the processed documents
# with open(DIR + 'necp_docs_lg.pickle', 'wb') as f:
# pickle.dump(necp_docs, f)
# load the previously processed documents
with open(DIR + 'necp_docs_lg.pickle', 'rb') as f:
necp_docs_2 = pickle.load(f)
necp_docs = necp_docs_2
# Country names and demonyms are uninformative for cross-country topic
# comparison, so they are removed from the lemma lists.
countries_stop_words = [
    'Austria', 'Austrian', 'Belgium', 'Belgian', 'Bulgaria', 'Bulgarian',
    'Czech', 'Cyprus', 'Cypriot', 'Germany', 'German', 'Denmark', 'Danish',
    'Estonia', 'Estonian', 'Croatia', 'Croatian', 'Finland', 'Finnish',
    'France', 'French', 'Malta', 'Maltese', 'Luxembourg', 'Lithuania',
    'Lithuanian', 'Latvia', 'Latvian', 'Italy', 'Italian', 'Ireland',
    'Irish', 'Hungary', 'Hungarian', 'Greece', 'Greek', 'Spain', 'Spanish',
    'Netherlands', 'Dutch', 'Poland', 'Polish', 'Portugal', 'Portuguese',
    'Romania', 'Romanian', 'Sweden', 'Swedish', 'Slovenia', 'Slovenian',
    'Slovakia', 'Slovak',
]
# Domain boilerplate that dominates every NECP regardless of topic.
extra_stop_words = [
    'energy', 'figure', 'table', 'plan', 'necp', 'national', 'use',
    'measure', 'sector', 'climate', 'plan', 'dimension', 'integrated',
    'section', 'republic', 'measures', 'policies', 'target', 'objective',
    'policy', 'projection', 'assessment', 'federal', 'government',
]
# Lemmatise each spaCy Doc, keeping only alphabetic, non-stop-word,
# non-punctuation tokens that are neither country names nor domain
# boilerplate (stacked `if` clauses AND together in a comprehension).
necp_processed["necp_lemmas"] = necp_docs.swifter.apply(lambda doc: [token.lemma_ for token in doc
if not token.is_stop
if not token.is_punct
if not (token.lemma_ in countries_stop_words)
if not (token.lemma_.lower() in extra_stop_words)
if token.is_alpha])
from gensim.models import Phrases

# Detect frequent bigrams (>= 20 corpus occurrences) and append each
# detected bigram token ("word_word") to the document it came from, so
# LDA can pick up multi-word concepts alongside the unigrams.
bigram = Phrases(necp_processed["necp_lemmas"], min_count=20)
for lemmas in necp_processed["necp_lemmas"]:
    # Materialise the Phrases output before extending: we must not grow
    # the list we are transforming while still reading from it.  This
    # also avoids the original pandas chained-indexing mutation
    # (necp_processed["necp_lemmas"][idx].append(...)), which relies on
    # the column holding references to the original list objects.
    detected = [token for token in bigram[lemmas] if '_' in token]
    lemmas.extend(detected)
def plot_counter(counter: list, orient: str = 'h', color: str = 'lightblue', figsize: tuple = (20, 13)):
    """Draw a labelled bar chart of word frequencies.

    Parameters:
        counter: sequence of ``(word, count)`` pairs, as returned by
            ``Counter.most_common``.  The original ``Counter`` annotation
            was misleading — iterating a ``Counter`` yields only its keys,
            and every caller passes a ``most_common()`` list.
        orient: bar orientation, forwarded to ``seaborn.barplot``.
        color: bar colour.
        figsize: matplotlib figure size in inches.

    Returns:
        The matplotlib ``Axes`` holding the bar plot.
    """
    plt.figure(figsize=figsize)
    # Unpack the pairs in one pass instead of two index-based comprehensions;
    # guard against an empty input, which zip(*...) would choke on.
    words, counts = zip(*counter) if counter else ((), ())
    ax = sns.barplot(x=[int(c) for c in counts], y=list(words), orient=orient, color=color)
    ax.bar_label(ax.containers[0])
    return ax
from gensim.models import CoherenceModel
# --- Dimension: Decarbonisation ---------------------------------------
# Lemma lists of the fragments tagged with this Energy Union dimension.
decarbonisation_docs = necp_processed[(necp_processed['energy_union_dimension'] == "Decarbonisation")]["necp_lemmas"]
# 30 most frequent lemmas (Series.sum() concatenates the per-doc lists).
decarbonisation_counter = Counter(decarbonisation_docs.sum()).most_common(30)
plot_counter(decarbonisation_counter)
plt.show()
# Drop dimension-defining words that would dominate every topic.
decarbonisation_docs = decarbonisation_docs.apply(lambda doc: [lemma for lemma in doc if not (lemma in ['emission', 'renewable'])])
# Bag-of-words encoding; keep tokens appearing in at least 2 documents.
decarbonisation_dictionary = Dictionary(decarbonisation_docs)
decarbonisation_dictionary.filter_extremes(no_below=2, no_above=1.0)
decarbonisation_encoded_docs = decarbonisation_docs.apply(decarbonisation_dictionary.doc2bow)
# Fit one LDA model for each topic count in 3..12.
decarbonisation_models = []
for topics_number in tqdm(range(3, 13)):
lda = LdaMulticore(decarbonisation_encoded_docs, num_topics=topics_number, passes=8, iterations=100, random_state=123)
decarbonisation_models.append(lda)
# Score every model with gensim's coherence measure (default 'c_v',
# which uses the raw texts supplied here).
decarbonisation_cvs = []
for model in tqdm(decarbonisation_models):
cm = CoherenceModel(model,texts=decarbonisation_docs, dictionary=decarbonisation_dictionary)
c_v = cm.get_coherence()
decarbonisation_cvs.append(c_v)
px.line(x=range(3, 13), y=decarbonisation_cvs)
# models[4] is the 7-topic model (index 4 in range(3, 13)).
vis = pyLDAvis.gensim_models.prepare(decarbonisation_models[4], decarbonisation_encoded_docs, dictionary=decarbonisation_dictionary)
vis
# Print the top-15 words of each topic.
for idx, topic in decarbonisation_models[4].show_topics(formatted=False, num_words=15):
print('Topic: {} \nWords: {}'.format(idx, [decarbonisation_dictionary[int(w[0])] for w in topic]))
'The Commission envisions the EU as the global hub for developing next-generation renewable energies. It aims to make the EU the world leader in the sector through preparing markets and grids for a growing proportion of renewable energy, and investing in advanced, sustainable alternative fuels.'
from matplotlib import colors
# Combined word-count / topic-weight bar charts, one panel per topic.
topics = decarbonisation_models[4].show_topics(formatted=False)
counter = Counter(decarbonisation_docs.sum())
out = []
for i, topic in topics:
for word, weight in topic:
word = decarbonisation_dictionary[int(word)]
out.append([word, i , weight, counter[word]])
df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])
# 3x2 grid for 6 of the 7 topics: topic id 3 is skipped (the index shift
# below jumps over it).
fig, axes = plt.subplots(3, 2, figsize=(14,14), sharey=True)
cols = [color for name, color in colors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
if i>=3:
i+=1
# Wide faded bars: corpus word counts; narrow bars on the twin axis:
# the word's LDA weight within the topic.
ax.bar(x='word', height="word_count", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.5, alpha=0.3, label='Word Count')
ax_twin = ax.twinx()
ax_twin.bar(x='word', height="importance", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.2, label='Weights')
ax.set_ylabel('Word Count', color=cols[i])
ax_twin.set_ylim(0, 0.018); ax.set_ylim(0, 3000)
ax.set_title('Topic: ' + str(i), color=cols[i], fontsize=12)
ax.tick_params(axis='y', left=False)
ax.set_xticklabels(df.loc[df.topic_id==i, 'word'], rotation=30, horizontalalignment= 'right')
ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')
ax.grid(False)
ax_twin.grid(False)
fig.suptitle('Topics for dimension: Decarbonisation', fontsize=16)
fig.tight_layout()
plt.show()
# Per-document topic distributions from the chosen 7-topic model.
decarbonisation_corpus_model = decarbonisation_models[4][decarbonisation_encoded_docs]
decarbonisation_metainfo = necp_processed[(necp_processed['energy_union_dimension'] == "Decarbonisation")]
res_len = len(decarbonisation_metainfo)
# One row per fragment, one column per topic (7 topics); topics absent
# from a document keep probability 0.
res = np.zeros((res_len, 7))
for i, doc in enumerate(decarbonisation_corpus_model):
for topic in doc:
res[i][topic[0]] = np.round(topic[1], 4)
decarbonisation_modeling_results = pd.concat([decarbonisation_metainfo.reset_index(drop=True), pd.DataFrame(res)], axis=1)
# Mean topic shares per country / per subsection; topic 3 is dropped,
# matching the panel skipped in the plot above.
decarbonisation_topic_probs = decarbonisation_modeling_results.groupby("country").mean().loc[:,[0, 1, 2, 4, 5, 6]]
decarbonisation_modeling_results.groupby("subsection").mean().loc[:,[0, 1, 2, 4, 5, 6]]
decarbonisation_topic_probs
import scipy.spatial as sp
import scipy.cluster.hierarchy as hc

# Average-linkage hierarchical clustering of countries in topic space.
linkage = hc.linkage(decarbonisation_topic_probs, method='average', metric='cosine')
# Pairwise cosine distances between countries, as a square matrix.
pairwise = sp.distance.pdist(decarbonisation_topic_probs.values, metric='cosine')
decarbonisation_similarities = sp.distance.squareform(pairwise)
plt.figure(figsize=(12, 8))
# Plot 1 - distance, i.e. cosine *similarity*, with dendrograms on both axes.
sns.clustermap(
    1 - decarbonisation_similarities,
    xticklabels=decarbonisation_topic_probs.index,
    yticklabels=decarbonisation_topic_probs.index,
    row_linkage=linkage,
    col_linkage=linkage,
)
plt.show()
# Mean topic mix per (country, subsection) pair.
decarbonisation_comparison = decarbonisation_modeling_results.groupby(["country", "subsection"]).mean().loc[:, 0:6]
countries = decarbonisation_modeling_results.country.unique()
sections = ["Policies and Measures", "National Objectives and Targets"]
# For each country with exactly one "Policies and Measures" row, compare
# that subsection's topic mix with the "National Objectives and Targets"
# mix via cosine similarity.
decarbonisation_change = {"country": [], "similarity": []}
for country in countries:
    is_country = decarbonisation_modeling_results["country"] == country
    pm = decarbonisation_modeling_results.loc[
        is_country & (decarbonisation_modeling_results["subsection"] == sections[0])
    ].loc[:, 0:6]
    noat = decarbonisation_modeling_results.loc[
        is_country & (decarbonisation_modeling_results["subsection"] == sections[1])
    ].loc[:, 0:6]
    if pm.shape[0] == 1:
        decarbonisation_change["country"].append(country)
        decarbonisation_change["similarity"].append(1 - sp.distance.cosine(pm, noat))
pd.DataFrame(decarbonisation_change)
# --- Dimension: Energy efficiency -------------------------------------
# Same pipeline as for Decarbonisation: frequency plot, dimension-word
# removal, BoW encoding, LDA for 3..12 topics, coherence scan.
energy_efficiency_docs = necp_processed[(necp_processed['energy_union_dimension'] == "Energy efficiency")]["necp_lemmas"]
energy_efficiency_counter = Counter(energy_efficiency_docs.sum()).most_common(30)
plot_counter(energy_efficiency_counter)
plt.show()
energy_efficiency_docs = energy_efficiency_docs.apply(lambda doc: [lemma for lemma in doc if not (lemma in ['building', 'efficiency', 'consumption'])])
energy_efficiency_dictionary = Dictionary(energy_efficiency_docs)
energy_efficiency_dictionary.filter_extremes(no_below=2, no_above=1.0)
energy_efficiency_encoded_docs = energy_efficiency_docs.apply(energy_efficiency_dictionary.doc2bow)
energy_efficiency_models = []
for topics_number in tqdm(range(3, 13)):
lda = LdaMulticore(energy_efficiency_encoded_docs, num_topics=topics_number, passes=8, iterations=100, random_state=123)
energy_efficiency_models.append(lda)
energy_efficiency_cvs = []
for model in tqdm(energy_efficiency_models):
cm = CoherenceModel(model,texts=energy_efficiency_docs, dictionary=energy_efficiency_dictionary)
c_v = cm.get_coherence()
energy_efficiency_cvs.append(c_v)
px.line(x=range(3, 13), y=energy_efficiency_cvs)
# models[8] is the 11-topic model (index 8 in range(3, 13)).
vis = pyLDAvis.gensim_models.prepare(energy_efficiency_models[8], energy_efficiency_encoded_docs, dictionary=energy_efficiency_dictionary)
vis
# num_topics=11 so that all topics of the chosen model are printed.
for idx, topic in energy_efficiency_models[8].show_topics(formatted=False, num_words=15, num_topics=11):
print('Topic: {} \nWords: {}'.format(idx, [energy_efficiency_dictionary[int(w[0])] for w in topic]))
from matplotlib import colors
# Word-count / topic-weight panels for the 11-topic model.
topics = energy_efficiency_models[8].show_topics(formatted=False, num_topics=11)
counter = Counter(energy_efficiency_docs.sum())
out = []
for i, topic in topics:
for word, weight in topic:
word = energy_efficiency_dictionary[int(word)]
out.append([word, i , weight, counter[word]])
df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])
# 5x2 grid for 10 of the 11 topics: topic id 4 is skipped by the index
# shift below.
fig, axes = plt.subplots(5, 2, figsize=(14, 20), sharey=True)
cols = [color for name, color in colors.TABLEAU_COLORS.items()]
# TABLEAU_COLORS has only 10 entries; reuse one so cols[10] exists.
cols.append(cols[4])
for i, ax in enumerate(axes.flatten()):
if i>=4:
i+=1
ax.bar(x='word', height="word_count", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.5, alpha=0.3, label='Word Count')
ax_twin = ax.twinx()
ax_twin.bar(x='word', height="importance", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.2, label='Weights')
ax.set_ylabel('Word Count', color=cols[i])
ax_twin.set_ylim(0, 0.045);
ax.set_ylim(0, 2500)
ax.set_title('Topic: ' + str(i), color=cols[i], fontsize=12)
ax.tick_params(axis='y', left=False)
ax.set_xticklabels(df.loc[df.topic_id==i, 'word'], rotation=30, horizontalalignment= 'right')
ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')
ax.grid(False)
ax_twin.grid(False)
fig.suptitle('Topics for dimension: Energy efficiency', fontsize=16)
fig.tight_layout()
plt.show()
# Per-document topic distributions from the chosen 11-topic model.
energy_efficiency_corpus_model = energy_efficiency_models[8][energy_efficiency_encoded_docs]
energy_efficiency_metainfo = necp_processed[(necp_processed['energy_union_dimension'] == "Energy efficiency")]
res_len = len(energy_efficiency_metainfo)
res = np.zeros((res_len, 11))
for i, doc in enumerate(energy_efficiency_corpus_model):
for topic in doc:
res[i][topic[0]] = np.round(topic[1], 4)
energy_efficiency_modeling_results = pd.concat([energy_efficiency_metainfo.reset_index(drop=True), pd.DataFrame(res)], axis=1)
# NOTE(review): the column selection drops topic 3, but the plot above
# skipped topic 4 — confirm which topic was meant to be excluded.
energy_efficiency_topic_probs = energy_efficiency_modeling_results.groupby("country").mean().loc[:,[0, 1, 2, 4, 5, 6, 7, 8, 9, 10]]
energy_efficiency_topic_probs
import scipy.spatial as sp
import scipy.cluster.hierarchy as hc
# Hierarchical clustering + cosine-similarity clustermap of countries.
linkage = hc.linkage(energy_efficiency_topic_probs, method='average', metric='cosine')
energy_efficiency_similarities = sp.distance.squareform(sp.distance.pdist(energy_efficiency_topic_probs.values, metric='cosine'))
plt.figure(figsize=(12, 8))
sns.clustermap(1-energy_efficiency_similarities,
xticklabels=energy_efficiency_topic_probs.index,
yticklabels=energy_efficiency_topic_probs.index,
row_linkage=linkage, col_linkage=linkage)
plt.show()
# Per-country cosine similarity between the "Policies and Measures" and
# "National Objectives and Targets" subsection topic mixes.
energy_efficiency_comparison = energy_efficiency_modeling_results.groupby(["country", "subsection"]).mean().loc[:,0:10]
countries = energy_efficiency_modeling_results.country.unique()
sections = ["Policies and Measures", "National Objectives and Targets"]
energy_efficiency_change = {"country": [], "similarity": []}
for country in countries:
pm = energy_efficiency_modeling_results.loc[(energy_efficiency_modeling_results["country"] == country) &
(energy_efficiency_modeling_results["subsection"] == sections[0])].loc[:,0:10]
noat = energy_efficiency_modeling_results.loc[(energy_efficiency_modeling_results["country"] == country) &
(energy_efficiency_modeling_results["subsection"] == sections[1])].loc[:,0:10]
if pm.shape[0]==1:
energy_efficiency_change["country"].append(country)
energy_efficiency_change["similarity"].append(1-sp.distance.cosine(pm, noat))
pd.DataFrame(energy_efficiency_change)
# --- Dimension: Energy security ---------------------------------------
# Same pipeline as for the previous dimensions.
energy_security_docs = necp_processed[(necp_processed['energy_union_dimension'] == "Energy security")]["necp_lemmas"]
energy_security_counter = Counter(energy_security_docs.sum()).most_common(30)
plot_counter(energy_security_counter)
plt.show()
energy_security_docs = energy_security_docs.apply(lambda doc: [lemma for lemma in doc if not (lemma in ['gas', 'supply', 'electricity', 'system', 'security'])])
energy_security_dictionary = Dictionary(energy_security_docs)
energy_security_dictionary.filter_extremes(no_below=2, no_above=1.0)
energy_security_encoded_docs = energy_security_docs.apply(energy_security_dictionary.doc2bow)
energy_security_models = []
for topics_number in tqdm(range(3, 13)):
lda = LdaMulticore(energy_security_encoded_docs, num_topics=topics_number, passes=8, iterations=100, random_state=123)
energy_security_models.append(lda)
energy_security_cvs = []
for model in tqdm(energy_security_models):
cm = CoherenceModel(model,texts=energy_security_docs, dictionary=energy_security_dictionary)
c_v = cm.get_coherence()
energy_security_cvs.append(c_v)
px.line(x=range(3, 13), y=energy_security_cvs)
# models[7] is the 10-topic model (index 7 in range(3, 13)).
vis = pyLDAvis.gensim_models.prepare(energy_security_models[7], energy_security_encoded_docs, dictionary=energy_security_dictionary)
vis
for idx, topic in energy_security_models[7].show_topics(formatted=False, num_words=15):
print('Topic: {} \nWords: {}'.format(idx, [energy_security_dictionary[int(w[0])] for w in topic]))
from matplotlib import colors
# Word-count / topic-weight panels for the 10-topic model.
topics = energy_security_models[7].show_topics(formatted=False)
counter = Counter(energy_security_docs.sum())
out = []
for i, topic in topics:
for word, weight in topic:
word = energy_security_dictionary[int(word)]
out.append([word, i , weight, counter[word]])
df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])
# 3x3 grid for 9 of the 10 topics: topic id 1 is skipped by the index
# shift below.
fig, axes = plt.subplots(3, 3, figsize=(21,12), sharey=True)
cols = [color for name, color in colors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
if i>=1:
i+=1
ax.bar(x='word', height="word_count", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.5, alpha=0.3, label='Word Count')
ax_twin = ax.twinx()
ax_twin.bar(x='word', height="importance", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.2, label='Weights')
ax.set_ylabel('Word Count', color=cols[i])
ax_twin.set_ylim(0, 0.025);
ax.set_ylim(0, 2500)
ax.set_title('Topic: ' + str(i), color=cols[i], fontsize=12)
ax.tick_params(axis='y', left=False)
ax.set_xticklabels(df.loc[df.topic_id==i, 'word'], rotation=30, horizontalalignment= 'right')
ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')
ax.grid(False)
ax_twin.grid(False)
fig.suptitle('Topics for dimension: Energy Security', fontsize=16)
fig.tight_layout()
plt.show()
# Per-document topic distributions from the chosen 10-topic model.
energy_security_corpus_model = energy_security_models[7][energy_security_encoded_docs]
energy_security_metainfo = necp_processed[(necp_processed['energy_union_dimension'] == "Energy security")]
res_len = len(energy_security_metainfo)
res = np.zeros((res_len, 10))
for i, doc in enumerate(energy_security_corpus_model):
for topic in doc:
res[i][topic[0]] = np.round(topic[1], 4)
energy_security_modeling_results = pd.concat([energy_security_metainfo.reset_index(drop=True), pd.DataFrame(res)], axis=1)
# NOTE(review): topic 3 is dropped here, but the plot above skipped
# topic 1 — confirm which topic was meant to be excluded.
energy_security_topic_probs = energy_security_modeling_results.groupby("country").mean().loc[:,[0, 1, 2, 4, 5, 6, 7, 8, 9]]
energy_security_topic_probs
# Hierarchical clustering + cosine-similarity clustermap of countries.
linkage = hc.linkage(energy_security_topic_probs, method='average', metric='cosine')
energy_security_similarities = sp.distance.squareform(sp.distance.pdist(energy_security_topic_probs.values, metric='cosine'))
plt.figure(figsize=(12, 8))
sns.clustermap(1-energy_security_similarities,
xticklabels=energy_security_topic_probs.index,
yticklabels=energy_security_topic_probs.index,
row_linkage=linkage, col_linkage=linkage)
plt.show()
# Per-country cosine similarity between the two subsection topic mixes.
energy_security_comparison = energy_security_modeling_results.groupby(["country", "subsection"]).mean().loc[:,0:9]
countries = energy_security_modeling_results.country.unique()
sections = ["Policies and Measures", "National Objectives and Targets"]
energy_security_change = {"country": [], "similarity": []}
for country in countries:
pm = energy_security_modeling_results.loc[(energy_security_modeling_results["country"] == country) &
(energy_security_modeling_results["subsection"] == sections[0])].loc[:,0:9]
noat = energy_security_modeling_results.loc[(energy_security_modeling_results["country"] == country) &
(energy_security_modeling_results["subsection"] == sections[1])].loc[:,0:9]
if pm.shape[0]==1:
energy_security_change["country"].append(country)
energy_security_change["similarity"].append(1-sp.distance.cosine(pm, noat))
pd.DataFrame(energy_security_change)
# --- Dimension: Internal market ---------------------------------------
# Same pipeline as for the previous dimensions.
internal_market_docs = necp_processed[(necp_processed['energy_union_dimension'] == "Internal market")]["necp_lemmas"]
internal_market_counter = Counter(internal_market_docs.sum()).most_common(30)
plot_counter(internal_market_counter)
plt.show()
internal_market_docs = internal_market_docs.apply(lambda doc: [lemma for lemma in doc if not (lemma in ['electricity', 'market', 'gas', 'system'])])
internal_market_dictionary = Dictionary(internal_market_docs)
internal_market_dictionary.filter_extremes(no_below=2, no_above=1.0)
internal_market_encoded_docs = internal_market_docs.apply(internal_market_dictionary.doc2bow)
internal_market_models = []
for topics_number in tqdm(range(3, 13)):
lda = LdaMulticore(internal_market_encoded_docs, num_topics=topics_number, passes=8, iterations=100, random_state=123)
internal_market_models.append(lda)
internal_market_cvs = []
for model in tqdm(internal_market_models):
cm = CoherenceModel(model,texts=internal_market_docs, dictionary=internal_market_dictionary)
c_v = cm.get_coherence()
internal_market_cvs.append(c_v)
px.line(x=range(3, 13), y=internal_market_cvs)
# models[2] is the 5-topic model (index 2 in range(3, 13)).
vis = pyLDAvis.gensim_models.prepare(internal_market_models[2], internal_market_encoded_docs, dictionary=internal_market_dictionary)
vis
for idx, topic in internal_market_models[2].show_topics(formatted=False, num_words=15):
print('Topic: {} \nWords: {}'.format(idx, [internal_market_dictionary[int(w[0])] for w in topic]))
from matplotlib import colors
# Word-count / topic-weight panels for the 5-topic model.
topics = internal_market_models[2].show_topics(formatted=False)
counter = Counter(internal_market_docs.sum())
out = []
for i, topic in topics:
for word, weight in topic:
word = internal_market_dictionary[int(word)]
out.append([word, i , weight, counter[word]])
df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])
# 2x2 grid for 4 of the 5 topics: topic id 2 is skipped by the index
# shift below.
fig, axes = plt.subplots(2, 2, figsize=(14,8), sharey=True)
cols = [color for name, color in colors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
if i>=2:
i+=1
ax.bar(x='word', height="word_count", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.5, alpha=0.3, label='Word Count')
ax_twin = ax.twinx()
ax_twin.bar(x='word', height="importance", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.2, label='Weights')
ax.set_ylabel('Word Count', color=cols[i])
ax_twin.set_ylim(0, 0.02);
ax.set_ylim(0, 2500)
ax.set_title('Topic: ' + str(i), color=cols[i], fontsize=12)
ax.tick_params(axis='y', left=False)
ax.set_xticklabels(df.loc[df.topic_id==i, 'word'], rotation=30, horizontalalignment= 'right')
ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')
ax.grid(False)
ax_twin.grid(False)
fig.suptitle('Topics for dimension: Internal market', fontsize=16)
fig.tight_layout()
plt.show()
# Per-document topic distributions from the chosen 5-topic model.
internal_market_corpus_model = internal_market_models[2][internal_market_encoded_docs]
internal_market_metainfo = necp_processed[(necp_processed['energy_union_dimension'] == "Internal market")]
res_len = len(internal_market_metainfo)
res = np.zeros((res_len, 5))
for i, doc in enumerate(internal_market_corpus_model):
for topic in doc:
res[i][topic[0]] = np.round(topic[1], 4)
internal_market_modeling_results = pd.concat([internal_market_metainfo.reset_index(drop=True), pd.DataFrame(res)], axis=1)
# NOTE(review): topic 3 is dropped here, but the plot above skipped
# topic 2 — confirm which topic was meant to be excluded.
internal_market_topic_probs = internal_market_modeling_results.groupby("country").mean().loc[:,[0, 1, 2, 4]]
internal_market_topic_probs
# Hierarchical clustering + cosine-similarity clustermap of countries.
linkage = hc.linkage(internal_market_topic_probs, method='average', metric='cosine')
internal_market_similarities = sp.distance.squareform(sp.distance.pdist(internal_market_topic_probs.values, metric='cosine'))
plt.figure(figsize=(12, 8))
sns.clustermap(1-internal_market_similarities,
xticklabels=internal_market_topic_probs.index,
yticklabels=internal_market_topic_probs.index,
row_linkage=linkage, col_linkage=linkage)
plt.show()
# Per-country cosine similarity between the two subsection topic mixes.
internal_market_comparison = internal_market_modeling_results.groupby(["country", "subsection"]).mean().loc[:,0:4]
countries = internal_market_modeling_results.country.unique()
sections = ["Policies and Measures", "National Objectives and Targets"]
internal_market_change = {"country": [], "similarity": []}
for country in countries:
pm = internal_market_modeling_results.loc[(internal_market_modeling_results["country"] == country) &
(internal_market_modeling_results["subsection"] == sections[0])].loc[:,0:4]
noat = internal_market_modeling_results.loc[(internal_market_modeling_results["country"] == country) &
(internal_market_modeling_results["subsection"] == sections[1])].loc[:,0:4]
if pm.shape[0]==1:
internal_market_change["country"].append(country)
internal_market_change["similarity"].append(1-sp.distance.cosine(pm, noat))
pd.DataFrame(internal_market_change)
# --- Dimension: R&I and Competitiveness -------------------------------
# Same pipeline; note the different LDA hyperparameters below.
research_docs = necp_processed[(necp_processed['energy_union_dimension'] == "R&I and Competitiveness")]["necp_lemmas"]
research_counter = Counter(research_docs.sum()).most_common(30)
plot_counter(research_counter)
plt.show()
research_docs = research_docs.apply(lambda doc: [lemma for lemma in doc if not (lemma in ['research'])])
research_dictionary = Dictionary(research_docs)
research_dictionary.filter_extremes(no_below=2, no_above=1.0)
research_encoded_docs = research_docs.apply(research_dictionary.doc2bow)
research_models = []
# Unlike the other dimensions: passes=10, iterations=80, random_state=42.
for topics_number in tqdm(range(3, 13)):
lda = LdaMulticore(research_encoded_docs, num_topics=topics_number, passes=10, iterations=80, random_state=42)
research_models.append(lda)
research_cvs = []
for model in tqdm(research_models):
cm = CoherenceModel(model,texts=research_docs, dictionary=research_dictionary)
c_v = cm.get_coherence()
research_cvs.append(c_v)
px.line(x=range(3, 13), y=research_cvs)
# models[1] is the 4-topic model (index 1 in range(3, 13)).
vis = pyLDAvis.gensim_models.prepare(research_models[1], research_encoded_docs, dictionary=research_dictionary)
vis
for idx, topic in research_models[1].show_topics(formatted=False, num_words=15):
print('Topic: {} \nWords: {}'.format(idx, [research_dictionary[int(w[0])] for w in topic]))
from matplotlib import colors
# Word-count / topic-weight panels for the 4-topic model (no skipped
# topic here: 2x2 grid matches the 4 topics exactly).
topics = research_models[1].show_topics(formatted=False)
counter = Counter(research_docs.sum())
out = []
for i, topic in topics:
for word, weight in topic:
word = research_dictionary[int(word)]
out.append([word, i , weight, counter[word]])
df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])
fig, axes = plt.subplots(2, 2, figsize=(14,8), sharey=True)
cols = [color for name, color in colors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
ax.bar(x='word', height="word_count", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.5, alpha=0.3, label='Word Count')
ax_twin = ax.twinx()
ax_twin.bar(x='word', height="importance", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.2, label='Weights')
ax.set_ylabel('Word Count', color=cols[i])
ax_twin.set_ylim(0, 0.02); ax.set_ylim(0, 2500)
ax.set_title('Topic: ' + str(i), color=cols[i], fontsize=12)
ax.tick_params(axis='y', left=False)
ax.set_xticklabels(df.loc[df.topic_id==i, 'word'], rotation=30, horizontalalignment= 'right')
ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')
ax.grid(False)
ax_twin.grid(False)
fig.suptitle('Topics for dimension: R&I and Competitiveness', fontsize=16)
fig.tight_layout()
plt.show()
# Per-document topic distributions from the chosen 4-topic model.
research_corpus_model = research_models[1][research_encoded_docs]
research_metainfo = necp_processed[(necp_processed['energy_union_dimension'] == "R&I and Competitiveness")]
res_len = len(research_metainfo)
res = np.zeros((res_len, 4))
for i, doc in enumerate(research_corpus_model):
for topic in doc:
res[i][topic[0]] = np.round(topic[1], 4)
research_modeling_results = pd.concat([research_metainfo.reset_index(drop=True), pd.DataFrame(res)], axis=1)
# All four topics are kept for this dimension.
research_topic_probs = research_modeling_results.groupby("country").mean().loc[:,[0, 1, 2, 3]]
research_topic_probs
import scipy.spatial as sp
import scipy.cluster.hierarchy as hc
# Hierarchical clustering + cosine-similarity clustermap of countries.
linkage = hc.linkage(research_topic_probs, method='average', metric='cosine')
research_similarities = sp.distance.squareform(sp.distance.pdist(research_topic_probs.values, metric='cosine'))
plt.figure(figsize=(12, 8))
sns.clustermap(1-research_similarities,
xticklabels=research_topic_probs.index,
yticklabels=research_topic_probs.index,
row_linkage=linkage, col_linkage=linkage)
plt.show()
# Per-country cosine similarity between the two subsection topic mixes.
research_comparison = research_modeling_results.groupby(["country", "subsection"]).mean().loc[:,0:3]
countries = research_modeling_results.country.unique()
sections = ["Policies and Measures", "National Objectives and Targets"]
research_change = {"country": [], "similarity": []}
for country in countries:
pm = research_modeling_results.loc[(research_modeling_results["country"] == country) &
(research_modeling_results["subsection"] == sections[0])].loc[:,0:3]
noat = research_modeling_results.loc[(research_modeling_results["country"] == country) &
(research_modeling_results["subsection"] == sections[1])].loc[:,0:3]
if pm.shape[0]==1:
research_change["country"].append(country)
research_change["similarity"].append(1-sp.distance.cosine(pm, noat))
pd.DataFrame(research_change)
# --- Subsection: Overview and Process for Establishing the Plan -------
# Same pipeline, but grouped by subsection instead of dimension.
overview_docs = necp_processed[(necp_processed['subsection'] == "Overview and Process for Establishing the Plan")]["necp_lemmas"]
overview_counter = Counter(overview_docs.sum()).most_common(30)
plot_counter(overview_counter)
plt.show()
overview_docs = overview_docs.apply(lambda doc: [lemma for lemma in doc if not (lemma in ['emission', 'renewable'])])
overview_dictionary = Dictionary(overview_docs)
overview_dictionary.filter_extremes(no_below=2, no_above=1.0)
overview_encoded_docs = overview_docs.apply(overview_dictionary.doc2bow)
overview_models = []
for topics_number in tqdm(range(3, 13)):
lda = LdaMulticore(overview_encoded_docs, num_topics=topics_number, passes=8, iterations=100, random_state=123)
overview_models.append(lda)
overview_cvs = []
for model in tqdm(overview_models):
cm = CoherenceModel(model,texts=overview_docs, dictionary=overview_dictionary)
c_v = cm.get_coherence()
overview_cvs.append(c_v)
px.line(x=range(3, 13), y=overview_cvs)
# models[0] is the 3-topic model (index 0 in range(3, 13)).
vis = pyLDAvis.gensim_models.prepare(overview_models[0], overview_encoded_docs, dictionary=overview_dictionary)
vis
for idx, topic in overview_models[0].show_topics(formatted=False, num_words=15):
print('Topic: {} \nWords: {}'.format(idx, [overview_dictionary[int(w[0])] for w in topic]))
# Per-document topic distributions from the 3-topic model.
overview_corpus_model = overview_models[0][overview_encoded_docs]
overview_metainfo = necp_processed[(necp_processed['subsection'] == "Overview and Process for Establishing the Plan")]
res_len = len(overview_metainfo)
res = np.zeros((res_len, 3))
for i, doc in enumerate(overview_corpus_model):
for topic in doc:
res[i][topic[0]] = np.round(topic[1], 4)
overview_modeling_results = pd.concat([overview_metainfo.reset_index(drop=True), pd.DataFrame(res)], axis=1)
overview_topic_probs = overview_modeling_results.groupby("country").mean().loc[:,[0, 1, 2]]
overview_topic_probs
import scipy.spatial as sp
import scipy.cluster.hierarchy as hc
# Hierarchical clustering + cosine-similarity clustermap of countries.
linkage = hc.linkage(overview_topic_probs, method='average', metric='cosine')
overview_similarities = sp.distance.squareform(sp.distance.pdist(overview_topic_probs.values, metric='cosine'))
plt.figure(figsize=(12, 8))
sns.clustermap(1-overview_similarities,
xticklabels=overview_topic_probs.index,
yticklabels=overview_topic_probs.index,
row_linkage=linkage, col_linkage=linkage)
plt.show()
from matplotlib import colors
# Word-count / topic-weight panels for the 3-topic model (1x3 grid, all
# topics shown).
topics = overview_models[0].show_topics(formatted=False)
counter = Counter(overview_docs.sum())
out = []
for i, topic in topics:
for word, weight in topic:
word = overview_dictionary[int(word)]
out.append([word, i , weight, counter[word]])
df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])
fig, axes = plt.subplots(1, 3, figsize=(21, 4), sharey=True)
cols = [color for name, color in colors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
ax.bar(x='word', height="word_count", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.5, alpha=0.3, label='Word Count')
ax_twin = ax.twinx()
ax_twin.bar(x='word', height="importance", data=df.loc[df.topic_id==i, :], color=cols[i], width=0.2, label='Weights')
ax.set_ylabel('Word Count', color=cols[i])
ax_twin.set_ylim(0, 0.0125); ax.set_ylim(0, 1500)
ax.set_title('Topic: ' + str(i), color=cols[i], fontsize=12)
ax.tick_params(axis='y', left=False)
ax.set_xticklabels(df.loc[df.topic_id==i, 'word'], rotation=30, horizontalalignment= 'right')
ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')
ax.grid(False)
ax_twin.grid(False)
fig.suptitle('Topics for Overview and Process for Establishing the Plan', fontsize=16)
fig.tight_layout()
plt.show()
# Second pass on the Overview subsection: remove two more dominant words
# and refit the whole model family on the further-filtered documents.
overview_docs = overview_docs.apply(lambda doc: [lemma for lemma in doc if not (lemma in ['electricity', 'gas', 'renewable', 'emission'])])
overview_dictionary = Dictionary(overview_docs)
overview_dictionary.filter_extremes(no_below=2, no_above=1.0)
overview_encoded_docs = overview_docs.apply(overview_dictionary.doc2bow)
overview_models = []
for topics_number in tqdm(range(3, 13)):
lda = LdaMulticore(overview_encoded_docs, num_topics=topics_number, passes=8, iterations=100, random_state=123)
overview_models.append(lda)
overview_cvs = []
for model in tqdm(overview_models):
cm = CoherenceModel(model,texts=overview_docs, dictionary=overview_dictionary)
c_v = cm.get_coherence()
overview_cvs.append(c_v)
px.line(x=range(3, 13), y=overview_cvs)
# models[6] is the 9-topic model (index 6 in range(3, 13)).
vis = pyLDAvis.gensim_models.prepare(overview_models[6], overview_encoded_docs, dictionary=overview_dictionary)
vis
for idx, topic in overview_models[6].show_topics(formatted=False, num_words=15):
print('Topic: {} \nWords: {}'.format(idx, [overview_dictionary[int(w[0])] for w in topic]))
overview_corpus_model = overview_models[6][overview_encoded_docs]
overview_metainfo = necp_processed[(necp_processed['subsection'] == "Overview and Process for Establishing the Plan")]
res_len = len(overview_metainfo)
res = np.zeros((res_len, 10))
for i, doc in enumerate(overview_corpus_model):
for topic in doc:
res[i][topic[0]] = np.round(topic[1], 4)
overview_modeling_results = pd.concat([overview_metainfo.reset_index(drop=True), pd.DataFrame(res)], axis=1)
overview_topic_probs = overview_modeling_results.groupby("country").mean().loc[:,[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]
overview_topic_probs
import scipy.spatial as sp
import scipy.cluster.hierarchy as hc

# Hierarchically cluster countries by the cosine similarity of their
# topic-probability profiles.
linkage = hc.linkage(overview_topic_probs, method='average', metric='cosine')
overview_similarities = sp.distance.squareform(
    sp.distance.pdist(overview_topic_probs.values, metric='cosine')
)
# FIX: sns.clustermap always creates its own figure, so the former
# plt.figure(figsize=(12, 8)) only produced a stray empty figure; pass the
# size through clustermap's own figsize parameter instead.
sns.clustermap(1 - overview_similarities,
               xticklabels=overview_topic_probs.index,
               yticklabels=overview_topic_probs.index,
               row_linkage=linkage, col_linkage=linkage,
               figsize=(12, 8))
plt.show()
necp_processed
# Lemmas of the "Impact Assessment" subsection and their 30 most frequent
# terms (plot_counter is defined earlier in the file).
impact_docs = necp_processed.loc[
    necp_processed['subsection'] == "Impact Assessment of Planned Policies and Measures",
    "necp_lemmas",
]
impact_counter = Counter(impact_docs.sum()).most_common(30)
plot_counter(impact_counter)
plt.show()
# BUG FIX: CoherenceModel is used below but was never imported (only
# Dictionary and LdaMulticore are imported at the top of the file).
from gensim.models.coherencemodel import CoherenceModel

# Drop dominant domain words that would otherwise swamp every topic.
impact_docs = impact_docs.apply(
    lambda doc: [lemma for lemma in doc if lemma not in ('emission', 'scenario')]
)

# Bag-of-words encoding; discard terms appearing in fewer than 2 documents.
impact_dictionary = Dictionary(impact_docs)
impact_dictionary.filter_extremes(no_below=2, no_above=1.0)
impact_encoded_docs = impact_docs.apply(impact_dictionary.doc2bow)

# Fit one LDA model per candidate topic count (3..12), fixed seed for
# reproducibility.
impact_models = []
for topics_number in tqdm(range(3, 13)):
    lda = LdaMulticore(impact_encoded_docs, num_topics=topics_number,
                       passes=8, iterations=100, random_state=123)
    impact_models.append(lda)

# Score each candidate with topic coherence to pick the best topic count.
impact_cvs = []
for model in tqdm(impact_models):
    cm = CoherenceModel(model, texts=impact_docs, dictionary=impact_dictionary)
    impact_cvs.append(cm.get_coherence())
px.line(x=range(3, 13), y=impact_cvs)
# Interactive pyLDAvis view of the selected model (index 6 -> 9 topics).
vis = pyLDAvis.gensim_models.prepare(
    impact_models[6], impact_encoded_docs, dictionary=impact_dictionary
)
vis
# List the 15 strongest terms per topic, mapping ids back through the
# dictionary (the model was built without id2word, so tokens are raw ids).
for idx, topic in impact_models[6].show_topics(formatted=False, num_words=15):
    words = [impact_dictionary[int(token_id)] for token_id, _weight in topic]
    print('Topic: {} \nWords: {}'.format(idx, words))
# Per-document topic distributions under the chosen model.
impact_corpus_model = impact_models[6][impact_encoded_docs]
impact_metainfo = necp_processed[
    necp_processed['subsection'] == "Impact Assessment of Planned Policies and Measures"
]
# Generalized: derive the column count from the model instead of hard-coding
# 9, so the cell keeps working if a different model index is chosen above.
# (impact_models[6].num_topics == range(3, 13)[6] == 9, so behavior is
# unchanged today.)
n_topics = impact_models[6].num_topics
res = np.zeros((len(impact_metainfo), n_topics))
for i, doc in enumerate(impact_corpus_model):
    for topic_id, prob in doc:
        res[i][topic_id] = np.round(prob, 4)
impact_modeling_results = pd.concat(
    [impact_metainfo.reset_index(drop=True), pd.DataFrame(res)], axis=1
)
# Mean topic probability per country.
impact_topic_probs = (
    impact_modeling_results.groupby("country").mean().loc[:, list(range(n_topics))]
)
impact_topic_probs
import scipy.spatial as sp
import scipy.cluster.hierarchy as hc

# BUG FIX: this cell previously clustered overview_topic_probs again (a
# copy-paste from the overview section), so the impact clustering was never
# actually shown. It must use impact_topic_probs computed just above.
linkage = hc.linkage(impact_topic_probs, method='average', metric='cosine')
impact_similarities = sp.distance.squareform(
    sp.distance.pdist(impact_topic_probs.values, metric='cosine')
)
plt.figure(figsize=(12, 8))
sns.clustermap(1 - impact_similarities,
               xticklabels=impact_topic_probs.index,
               yticklabels=impact_topic_probs.index,
               row_linkage=linkage, col_linkage=linkage)
plt.show()
from matplotlib import colors

# Build a tidy frame of the top terms per topic: word, topic id, LDA weight,
# and raw corpus frequency.
topics = impact_models[6].show_topics(formatted=False)
counter = Counter(impact_docs.sum())
out = []
for i, topic in topics:
    for token_id, weight in topic:
        word = impact_dictionary[int(token_id)]
        out.append([word, i, weight, counter[word]])
df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])

# One subplot per topic: wide translucent bars show corpus word counts
# (left axis), narrow solid bars show LDA term weights (right twin axis).
fig, axes = plt.subplots(3, 3, figsize=(21, 12), sharey=True)
cols = list(colors.TABLEAU_COLORS.values())
for i, ax in enumerate(axes.flatten()):
    topic_rows = df.loc[df.topic_id == i, :]
    ax.bar(x='word', height="word_count", data=topic_rows,
           color=cols[i], width=0.5, alpha=0.3, label='Word Count')
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=topic_rows,
                color=cols[i], width=0.2, label='Weights')
    ax.set_ylabel('Word Count', color=cols[i])
    ax_twin.set_ylim(0, 0.025)
    ax.set_ylim(0, 1500)
    ax.set_title('Topic: ' + str(i), color=cols[i], fontsize=12)
    ax.tick_params(axis='y', left=False)
    ax.set_xticklabels(topic_rows['word'], rotation=30, horizontalalignment='right')
    ax.legend(loc='upper left')
    ax_twin.legend(loc='upper right')
    ax.grid(False)
    ax_twin.grid(False)
fig.suptitle('Topics for Impact Assessment of Planned Policies and Measures', fontsize=16)
fig.tight_layout()
plt.show()